/*******************************************************************************************************************************************/
/* Program adapted from Pierre Pora's work                                                                                                 */
/*This program takes labor market data (DADS panel), at the level individual * firm * year and aggregates it at the individual * year level*/
/*  Selection of observations and variables is aligned on Olivier Godechot's work on DADS postes when possible                             */
/*******************************************************************************************************************************************/

/*Where the input data (DADS panel) is stored*/
/* One libref per delivery folder; the load step further down reads one table from each of them */
libname pan76 "X:\HAB-DADS-Mise-a-disposition\HAB_A118019B_DPANS7693SAS" ; 
libname pan94 "X:\HAB-DADS-Mise-a-disposition\HAB_A118019B_DPANS9401SAS" ; 
libname pan02 "X:\HAB-DADS-Mise-a-disposition\HAB_A118019B_DPANS0207SAS" ; 
libname pan08 "X:\HAB-DADS-Mise-a-disposition\HAB_A118019B_DPANS0812SAS" ; 
libname pan13 "X:\HAB-DADS-Mise-a-disposition\HAB_A118019B_DPANS1319SAS" ; 

/*Where the output data will be stored*/
/* DATADIR is the output folder; libref lib2 points to it */
%let DATADIR=X:\HAB-deep-learning-pour-ssp\blm_data ; 
libname lib2 "&DATADIR.";

/*Where the code is stored*/
/* CODEDIR = full path of the running program (SAS_EXECFILEPATH) cut down by the length of its
   file name (SAS_EXECFILENAME), i.e. the folder part of the path, trailing separator included.
   NOTE(review): these two environment variables are presumably only defined when the program is
   submitted from the interactive Windows editor - confirm before running it in batch mode. */
%let CODEDIR=%qsubstr(%sysget(SAS_EXECFILEPATH),1,%length(%sysget(SAS_EXECFILEPATH))-%length(%sysget(SAS_EXECFILEname))); 

/*SAS macros that aggregate either numerical variables (e.g. earnings), categorical variables (e.g. occupation) or durations (e.g. working days)*/ 
/* The included file is expected to define the %AGREGMAJ and %AGREGBIN macros called in the aggregation step */

%INCLUDE "&CODEDIR.\0BIS-MACROS_AGGREG.sas" ; 

/*Display errors in log*/
/* MPRINT writes the statements generated by macros to the log; ERRORS=1 limits the detailed
   error dump to one observation; FULLSTIMER reports full resource usage for each step;
   BUFSIZE/BUFNO set the size and number of data set I/O buffers */

OPTIONS MPRINT ERRORS=1 NOOVP PAGESIZE=MAX LINESIZE=72;
options fullstimer;
options BUFSIZE=32k BUFNO=2k;

/*Load data from 1976 to 2019*/

/* Stack the five panel tables into a single work table.
   RENAME= is an output data set option: inside the step the variables still carry
   their source names (an, sx, ...); the new names only exist in dads_panel. */
data dads_panel(rename=(nninouv=indiv_id an=year annai=yearbirth sx=sex));
	set pan76.PAN19_19761993
	    pan94.PAN19_19942001
	    pan02.PAN19_20022007
	    pan08.PAN19_20082012
	    pan13.PAN19_20132019;

	/* Keep observation years 1976 to 2019 */
	if 1976 <= an < 2020;

	/* Gross earnings correction: from 2014 onwards the gross earnings variables
	   are overwritten with bascsgr / bascsg */
	if an > 2013 then do;
		sbr = bascsgr;
		sb  = bascsg;
	end;
run;

/* Checks on a possible break in DOMEMPL_EMPL in 2009: distribution in 2008, then in 2009 */
proc freq data=dads_panel;
	where year = 2008;
	tables DOMEMPL_EMPL / missing;
run;

proc freq data=dads_panel;
	where year = 2009;
	tables DOMEMPL_EMPL / missing;
run;

/* There is apparently no break. More detail on a fixed list of firms (SIR values below),
   before and after 2009, restricted to observations with FILTRE="1". */

/* Up to 2008 */
data d8;
	set dads_panel;
	where FILTRE = "1" and year < 2009;
run;

data d8epic;
	set d8;
	where SIR in ("380129866","444608442","444619258","444786511","552043002","420495178","552081317","542107651");
run;

proc freq data=d8epic;
	tables DOMEMPL_EMPL*SIR / missing;
run;

proc freq data=d8epic;
	tables CATJUR*SIR / missing;
run;

/* From 2009 onwards */
data d9;
	set dads_panel;
	where FILTRE = "1" and year > 2008;
run;

data d9epic;
	set d9;
	where SIR in ("380129866","444608442","444619258","444786511","552043002","420495178","552081317","542107651");
run;

proc freq data=d9epic;
	tables DOMEMPL_EMPL*SIR / missing;
run;

proc freq data=d9epic;
	tables CATJUR*SIR / missing;
run;

/* Earnings distribution by year on the unfiltered panel */
proc means data=dads_panel P10 Median P90 Mean;
	class year;
	var sbr sb netnet netnetr;
run;

/* Crosstabs on the whole panel.
   All requests are grouped in ONE PROC FREQ step so that the panel is read once
   instead of once per table; the tables produced, and their order, are unchanged.
   (All requested tables are built during the same pass, so this step needs more
   memory than each single-table step did.) */
proc freq data=dads_panel;
	tables DOMEMPL_EMPL*SECT
	       DOMEMPL_EMPL*CATJUR
	       DOMEMPL_EMPL*PTT
	       APEN*DOMEMPL_EMPL
	       YEAR*DOMEMPL_EMPL
	       YEAR*CS2
	       YEAR*CS2_ANC
	       YEAR*ST
	       YEAR*sex
	       / missing;
run;

/* Crosstabs restricted to 1986 and later (one pass for both tables) */
proc freq data=dads_panel;
	where year > 1985;
	tables DOMEMPL*ST
	       SECT*ST
	       / missing;
run;

/* Crosstabs restricted to 1985 and earlier (one pass for both tables) */
proc freq data=dads_panel;
	where year < 1986;
	tables SECT*ST
	       SECT*APEN
	       / missing;
run;


/* Adds two 0/1 filter flags to dads_panel (the table is rewritten in place). */
data dads_panel;
	set dads_panel;

	/* filtre_Insee: Insee "private employer" field.
	   Excludes all public sector and salaried employees directly employed by households. */
	filtre_Insee = (
		CATJUR ne '0000'
		and substr(CATJUR,1,1) ne '7'
		and DOMEMPL not in ('1' '2' '3' '7')
		and substr(SIR,1,1) ne '1'
		and PTT = '0'
		and SECT = 'PRIV'
		/* Excludes unemployment benefits */
		and CE ne 'A'
		/* Excludes observations with improper individual identifier */
		and NNIFICT = '0'
		/* Excludes agricultural sector */
		and NES5 ne 'ES'
	);

	/* filtre_BGP: our 2002-2016 field.
	   Same conditions as filtre_Insee, with further exclusions:
	   public firms and administrations, and individual firms. */
	filtre_BGP = (
		filtre_Insee = 1
		/* DOMEMPL values 4, 5 and 6 are excluded in addition to 1, 2, 3 and 7 */
		and DOMEMPL not in ('4' '5' '6')
		/* Exclude public owned firms before 1986 */
		and (year > 1985 or (year < 1986 and ST ne '5'))
		/* Excludes interns and apprentices since 1984 */
		and substr(CS2,1,1) ne '7'
		/* Excludes individuals with missing sex variable */
		and not missing(sex)
	);
run;

/* Cross-check of the two filter flags */
proc freq data=dads_panel;
	table filtre_Insee*filtre_BGP / missing;
run;

/* Sort the panel before aggregation, keeping only observations with filtre_Insee=1.
   Sort keys, in order:
     indiv_id : individual identifier (NIR)
     year     : year
     sbr      : gross labor earnings (paid by a specific employer)
     dp       : working days (for a given employer)
     nbheur   : working hours (for a given employer)
     sir      : firm identifier */
proc sort data=dads_panel(where=(filtre_Insee=1)) out=dads_panelf;
	by indiv_id year sbr dp nbheur sir;
run;

/* Earnings distribution by year on the filtered panel; statistics are also saved in wagestat.
   NOTE(review): the OUTPUT statement gives a single name per statistic while VAR lists eight
   variables - check which variables actually end up in wagestat. */
proc means data=dads_panelf P10 Median P90 Mean;
	class year;
	var sbr sb netnet netnetr BASCSG BASCSGR S_BRUT S_BRUTR;
	output out=wagestat
		mean=Moyenne
		P10=decile_1
		Median=mediane
		P90=decile_9;
run;



/*Aggregate data at the individual * year level*/
/* Reads the sorted table dads_panelf with BY-group processing and writes one observation
   per indiv_id * year (the OUTPUT statement only fires on LAST.year).
   The %AGREGMAJ / %AGREGBIN macros are defined outside this file (0BIS-MACROS_AGGREG.sas);
   what exactly they compute cannot be checked from here. */

DATA labor_market_data(
	sgio=yes 
	sortedby=indiv_id year
	compress=yes 
	KEEP= 
/*Earnings measures*/
	/*Total real net earnings (2015 euros)*/
		earnings_net_r 
	/*Total real gross earnings (2015 euros)*/
		earnings_gross_r
	/*Total gross earnings built from SB (presumably nominal - TODO confirm)*/
		earnings_gross
	/*Total nominal net earnings (euros)*/
		earnings_net_n
/*Working time measures*/
	/*Total working days*/
		days
	/*Total working hours*/
		hours 
	/*Full-time status*/
		fulltime
/*Industry*/
	/*1-digit industry*/
		industry
/*Occupation*/
	/*1-digit occupation*/
		occupation
/*Demographic variables*/
	/*Sex*/
		sex
	/*Year of birth*/
		yearbirth
	/* Age*/
	/* NOTE(review): age is never assigned in this step; presumably it is read from dads_panelf - confirm */
		age
/*Identifier*/
	/*Individual identifier*/
		indiv_id
	/*Year*/
		year
	/*Main employer (firm) identifier*/
		firm_id
	);

SET dads_panelf ;
/* BY variables are the first four keys of the preceding PROC SORT */
BY indiv_id year sbr dp;

/* Occupation code 'X' is treated as missing before aggregation */
IF CS1='X' THEN CS1='';
%AGREGMAJ(VARIN=CS1,VAROUT=occupation,PER=year);

%AGREGMAJ(VARIN=SIR,VAROUT=firm_id,PER=year);
%AGREGMAJ(VARIN=NES5,VAROUT=industry,PER=year);

/* NOTE(review): the same macro (AGREGMAJ) is called for categorical variables above and for
   earnings, days and hours below, although the KEEP comments describe these as totals -
   confirm against the macro definition that this is the intended aggregation */
%AGREGMAJ(VARIN=NETNETR,VAROUT=earnings_net_r,PER=year);
%AGREGMAJ(VARIN=NETNET,VAROUT=earnings_net_n,PER=year);
%AGREGMAJ(VARIN=SBR,VAROUT=earnings_gross_r,PER=year);
%AGREGMAJ(VARIN=SB,VAROUT=earnings_gross,PER=year);

/* Zero working days are set to missing before aggregation */
DP2=DP;IF DP=0 THEN DP2=.;
%AGREGMAJ(VARIN=DP2,VAROUT=days,PER=year);
/* NOTE(review): DPTOT is not created by any statement visible here and is not kept;
   if the intent is to cap the yearly number of days at 360, the cap may need to apply to days - confirm */
IF DPTOT>360 THEN DPTOT=360;
/* Zero working hours are set to missing before aggregation */
NBH2=NBHEUR;IF NBHEUR=0 THEN NBH2=.;
%AGREGMAJ(VARIN=NBH2,VAROUT=hours,PER=year);

/* TNC is 1 when CE differs from 'C' and 0 when CE='C'; it is not assigned here when CE is blank */
IF CE NE ''  THEN TNC=( CE NE 'C');

%AGREGBIN(VARIN=TNC,VAROUT=TNC_TOT,PER=year);
length fulltime 3.;
/* fulltime is the complement of the aggregated TNC indicator */
fulltime=1-TNC_TOT;
/* The lone semicolon below is a null statement (no effect) */
;

/* Write one observation per individual * year */
IF LAST.year THEN OUTPUT;

RUN;

/* Earnings distribution by year on the aggregated table, before sample selection */
proc means data=labor_market_data P10 Median P90 Mean;
	class year;
	var earnings_gross_r earnings_gross earnings_net_n earnings_net_r;
run;

/* Final individual * year table, saved in the output library.
   Adds: smic (value looked up by year, see below), threshold (80% of smic),
   hourlywage, included, and the square and cube of age.
   Only observations with at least 90 working days and non-missing age and sex are kept. */
DATA lib2.labor_market_data_i;
	SET labor_market_data;

	/* SMIC by year. Values up to 2001 are divided by 6.55957 (presumably the franc-to-euro
	   conversion rate - confirm); values from 2002 onwards are used as they are.
	   Years outside the table get a missing smic.
	   The original if/else chain tested year=1980 twice; the second, unreachable
	   branch has been removed. */
	select (year);
		when (1976) smic = 8.94/6.55957;
		when (1977) smic = 10.06/6.55957;
		when (1978) smic = 11.31/6.55957;
		when (1979) smic = 12.93/6.55957;
		when (1980) smic = 14.79/6.55957;
		when (1981) smic = 17.76/6.55957;
		when (1982) smic = 20.29/6.55957;
		when (1983) smic = 22.33/6.55957;
		when (1984) smic = 24.36/6.55957;
		when (1985) smic = 26.04/6.55957;
		when (1986) smic = 26.92/6.55957;
		when (1987) smic = 27.84/6.55957;
		when (1988) smic = 28.76/6.55957;
		when (1989) smic = 29.91/6.55957;
		when (1990) smic = 31.94/6.55957;
		when (1991) smic = 32.66/6.55957;
		when (1992) smic = 34.06/6.55957;
		when (1993) smic = 34.83/6.55957;
		when (1994) smic = 35.56/6.55957;
		when (1995) smic = 36.98/6.55957;
		when (1996) smic = 37.91/6.55957;
		when (1997) smic = 39.43/6.55957;
		when (1998) smic = 40.22/6.55957;
		when (1999) smic = 40.72/6.55957;
		when (2000) smic = 42.02/6.55957;
		when (2001) smic = 43.72/6.55957;
		when (2002) smic = 6.83;
		when (2003) smic = 7.19;
		when (2004) smic = 7.61;
		when (2005) smic = 8.03;
		when (2006) smic = 8.27;
		when (2007) smic = 8.44;
		when (2008) smic = 8.63;
		when (2009) smic = 8.82;
		when (2010) smic = 8.86;
		when (2011) smic = 9.19;
		when (2012) smic = 9.40;
		when (2013) smic = 9.43;
		when (2014) smic = 9.53;
		when (2015) smic = 9.61;
		when (2016) smic = 9.67;
		when (2017) smic = 9.76;
		when (2018) smic = 9.88;
		when (2019) smic = 10.03;
		when (2020) smic = 10.15;
		when (2021) smic = 10.25;
		otherwise   smic = .;
	end;

	threshold = 0.8*smic;

	/* Hourly wage. The division is only attempted when hours is non-missing and non-zero:
	   the result is the same as before (missing otherwise), but the log is no longer
	   filled with missing-value / division-by-zero notes. */
	if not missing(hours) and hours ne 0 then hourlywage = earnings_gross/hours;
	else hourlywage = .;

	/* Sample selection: at least 90 working days, age and sex known */
	included = DAYS>=90 and missing(age)=0 and missing(sex)=0;
	if included=1;

	/* Polynomial terms in age */
	age_2 = age*age;
	age_3 = age*age*age;
RUN;

/* Earnings and hourly wage distribution by year on the final table */
proc means data=lib2.labor_market_data_i P10 Median P90 Mean Var;
	class year;
	var earnings_gross_r earnings_gross earnings_net_n earnings_net_r hourlywage;
run;

/* Full-time status by year on the final table.
   The original step read lib2.labor_market_data, which this program never creates;
   the table it saves in lib2 is labor_market_data_i. */
proc freq data=lib2.labor_market_data_i;
	table fulltime*YEAR;
run;

/* Sex by year in the raw 1976-1993 file.
   The original step read PAN17_19761993, whereas the table loaded from this library at the
   top of the program is PAN19_19761993; the name is aligned on the table actually used. */
proc freq data=pan76.PAN19_19761993;
	table sx*an;
run;

/* Spot check on one firm: its observations after aggregation (essai) ... */
data essai;
	set labor_market_data;
	where firm_id = "368500708";
run;

/* ... and in the panel before aggregation (essai2) */
data essai2;
	set dads_panel;
	where SIR = "368500708";
run;



/* Main issues for the long series:

DOMEMPL starts in 1986*/
/*TNC not present in 76-93 file */
